home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
IRIX Installation Tools & Overlays 2002 November
/
SGI IRIX Installation Tools & Overlays 2002 November - Disc 4.iso
/
dist
/
infosearch.idb
/
usr
/
lib
/
infosearch
/
bin
/
mknmz.z
/
mknmz
Wrap
Text File
|
2002-10-15
|
69KB
|
2,473 lines
#! /usr/bin/perl5 -w
# -*- Perl -*-
# mknmz - indexer of Namazu
# $Id: mknmz,v 1.1 2002/08/14 15:51:10 agd Exp $
#
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000,2001 Namazu Project All rights reserved.
# This is free software with ABSOLUTELY NO WARRANTY.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either versions 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA
#
# This file must be encoded in EUC-JP encoding
#
package mknmz;
require 5.004;
use lib ".";
use Cwd;
use IO::File;
use File::Find;
# Site-local copy of File::MMagic (media-type detection) is loaded by path.
require "/usr/lib/infosearch/l10n/namazu/pm/MMagic.pm";
#use File::MMagic;
use Time::Local;
use strict; # be strict since v1.2.0
use Getopt::Long;
use File::Copy;
use DirHandle;
# $SYSTEM holds $^O (operating system name); package-global because it is
# consulted throughout for MSWin32/os2 special cases.
use vars qw($SYSTEM);
$SYSTEM = $^O;
my $NAMAZU_INDEX_VERSION = "2.0";
# Character coding system of this run: 'euc' or 'sjis' (set in init()).
my $CodingSystem = "euc";
# Data directory; overridable through the pkgdatadir environment variable.
my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/lib/infosearch/l10n/namazu";
my $CONFDIR = "/usr/lib/infosearch/l10n/namazu/etc"; # directory where mknmzrc are in.
my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
# Counters carried across checkpoint re-execs (see re_exec()).
my $DeletedFilesCount = 0;
my $UpdatedFilesCount = 0;
my $APPENDMODE = 0;
# In-memory indexing state, flushed by write_index()/write_phrase_hash().
my %PhraseHash = ();
my %PhraseHashLast = ();
my %KeyIndex = ();
my %KeyIndexLast = ();
# Checkpoint flags: 'on' enables checkpointing, 'continue' marks a resumed run.
my %CheckPoint = ("on" => undef, "continue" => undef);
my $ConfigFile = undef;
my $MediaType = undef;
my $ReplaceCode = undef; # perl code for transforming URI
my @Seed = ();
my @LoadedRcfiles = ();
my $Magic = new File::MMagic;
# Set by the SIGTERM handler to request a graceful stop of the main loop.
my $ReceiveTERM = 0;
STDOUT->autoflush(1);
STDERR->autoflush(1);
main();
# Top-level driver: initialize, load configuration/modules, then walk the
# target file list (NMZ._flist), indexing each file and flushing index
# chunks whenever the in-memory buffers exceed $conf::ON_MEMORY_MAX.
# Supports checkpoint/restart: state is saved and the process re-execs.
sub main {
my $start_time = time;
init();
# At first, loading pl/conf.pl to prevent overriding some variables.
preload_modules();
# set LANG and bind textdomain
util::set_lang();
textdomain('namazu', $util::LANG_MSG);
load_rcfiles();
load_modules();
my ($output_dir, @targets) = parse_options();
my ($docid_base, $total_files_num) = prep($output_dir, @targets);
my $swap = 1;
my $docid_count = 0;
my $file_count = 0;
my $total_files_size = 0;
my $key_count = 0;
my $checkpoint = 0;
# Byte offset into NMZ._flist; saved at checkpoint so a resumed run can
# seek() back to where it stopped.
my $flist_ptr = 0;
my $processed_files_size = 0;
if ($CheckPoint{'continue'}) {
# Restore variables
eval util::readfile($var::NMZ{'_checkpoint'}) ;
} else {
print $total_files_num . _(" files are found to be indexed.\n");
}
# Bare block so the two file handles close before re_exec() below.
{
my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
my $fh_flist = util::efopen($var::NMZ{'_flist'});
my %field_indices = ();
get_field_index_base(\%field_indices);
if ($CheckPoint{'continue'}) {
seek($fh_flist, $flist_ptr, 0);
}
# Process target files one by one
while (defined(my $line = <$fh_flist>)) {
$flist_ptr += length($line);
my $cfile = $line;
chomp $cfile;
util::dprint(_("target file: ")."$cfile\n");
my ($cfile_size, $num) =
process_file($cfile, $docid_count, $docid_base,
$file_count, \%field_indices,
$fh_errorsfile, $total_files_num);
# $num == 0 means the file was rejected: shrink the total instead of
# advancing the counters.
if ($num == 0) {
$total_files_num--;
next;
} else {
$docid_count += $num;
$file_count++;
}
$total_files_size += $cfile_size;
$processed_files_size += $cfile_size;
last if $ReceiveTERM;
# Flush the in-memory index once enough bytes have been processed.
if ($processed_files_size > $conf::ON_MEMORY_MAX) {
if (%KeyIndex) {
$key_count = write_index();
print _("Writing index files...");
write_phrase_hash();
print "\n";
}
$processed_files_size = 0;
# Comma operator: set the flag AND leave the loop.  The extra
# defined(<$fh_flist>) read checks that more files remain; the
# consumed line is not lost because the resumed process seeks to
# $flist_ptr, which was not advanced for it.
$checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
}
}
}
# This should be out of above blocks because of file handler closing.
re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
$total_files_size, $total_files_num,
$file_count, $key_count) if $checkpoint;
# Final flush of whatever is still buffered.
if (%KeyIndex) {
$key_count = write_index();
print _("Writing index files...");
write_phrase_hash();
print "\n";
}
$key_count = get_total_keys() unless $key_count;
do_remain_job($total_files_size, $docid_count, $key_count,
$start_time);
exit 0;
}
#
# FIXME: Very complicated.
#
# Index one target file.  With --html-split, an HTML file may be split
# into parts, each indexed as a separate document with a #fragment URI.
# Returns ($file_size, $number_of_documents_indexed); the second value is
# 0 when the file was rejected.
# NOTE(review): the prototype declares six scalars but seven arguments
# are unpacked; this goes unenforced only because the call site in main()
# is compiled before this definition — confirm before relying on it.
sub process_file ($$$$$$) {
my ($cfile, $docid_count, $docid_base, $file_count,
$field_indices, $fh_errorsfile, $total_files_num) = @_;
my $processed_num = 0;
my $file_size = util::filesize($cfile);
if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
my @parts = htmlsplit::split($cfile, "NMZ.partial");
# Only treat it as split when there is more than one part; otherwise
# fall through to the whole-file path below.
if (@parts > 1) {
my $id = 0;
for my $part (@parts) {
# Each part was written to a temporary NMZ.partial.<id> file.
my $fname = util::tmpnam("NMZ.partial.$id");
my $fragment = defined $part ? $part : undef;
my $uri = generate_uri($cfile, $fragment);
my $result = namazu_core($fname,
$docid_count + $processed_num,
$docid_base, $file_count,
$field_indices, $fh_errorsfile,
$total_files_num,
$uri, $id, $#parts);
if ($result > 0) {
$processed_num++;
# Registry records "file\tfragment" for split parts.
my $rname = defined $part ? "$cfile\t$part" : "$cfile";
put_registry($rname);
}
unlink $fname;
$id++;
}
return ($file_size, $processed_num);
}
}
# Ordinary (unsplit) file: index it as a single document.
my $result = namazu_core($cfile, $docid_count, $docid_base,
$file_count, $field_indices,
$fh_errorsfile, $total_files_num,
undef, undef, undef);
if ($result > 0) {
$processed_num++;
put_registry($cfile);
}
return ($file_size, $processed_num);
}
#
# Load mknmzrcs:
#
# 1. MKNMZRC environment
#
# 2. $(sysconfdir)/$(PACKAGE)/mknmzrc
#
# 3. ~/.mknmzrc
#
# 4. user-specified mknmzrc set by mknmz --config=file option.
#
# If multiple files exists, read all of them.
#
# Collect the candidate mknmzrc files (environment variable, system-wide,
# per-user) and load every one that exists, reporting them in verbose mode.
sub load_rcfiles () {
    # The MKNMZRC environment variable comes first: Windows' drive-letter
    # convention makes it necessary to point at an rcfile dynamically.
    my @candidates = (defined $ENV{'MKNMZRC'} ? ($ENV{'MKNMZRC'}) : ());
    push @candidates, "$CONFDIR/mknmzrc", "$ENV{'HOME'}/.mknmzrc";
    util::vprint(_("Reading rcfile: "));
    for my $candidate (@candidates) {
        next unless -f $candidate;
        load_rcfile($candidate);
        util::vprint(" $candidate");
    }
    util::vprint("\n");
}
# Load a single rcfile unless it was loaded before, recording it in
# @LoadedRcfiles.  An rcfile may override the library/filter/template
# directories via $conf:: variables.
sub load_rcfile ($) {
    my ($rcfile) = @_;
    if ($SYSTEM eq "MSWin32" || $SYSTEM eq "os2") {
        # convert \ to / with consideration for Shift_JIS Kanji code
        $rcfile =~
            s!([\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x01-\x7f])!
              $1 eq "\\" ? "/" : $1!gex;
    }
    # BUGFIX: the duplicate check used grep {m/^$rcfile$/}, interpolating
    # the path into a regex; metacharacters such as '.' or '(' could
    # mis-match or abort the pattern.  Compare as plain strings instead.
    return if (grep { $_ eq $rcfile } @LoadedRcfiles);
    do $rcfile;
    push @LoadedRcfiles, $rcfile;
    # Dirty workaround: let the rcfile redirect these directories.
    $LIBDIR = $conf::LIBDIR
        if (defined $conf::LIBDIR && -d $conf::LIBDIR);
    $FILTERDIR = $conf::FILTERDIR
        if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
    $TEMPLATEDIR = $conf::TEMPLATEDIR
        if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
}
# Checkpoint support: dump the indexing state as Perl assignments into
# NMZ._checkpoint (eval'ed on resume by main()), then re-exec ourselves
# with -S prepended so the new process knows it is a continuation.
sub re_exec($$$$$$$$) {
my ($flist_ptr, $docid_count, $docid_base, $start_time,
$total_files_size, $total_files_num, $file_count, $key_count) = @_;
# store variables
{
my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
print $fh_checkpoint <<EOM;
\$DeletedFilesCount = $DeletedFilesCount;
\$UpdatedFilesCount = $UpdatedFilesCount;
\$APPENDMODE = $APPENDMODE;
\$flist_ptr = $flist_ptr;
\$docid_count = $docid_count;
\$docid_base = $docid_base;
\$start_time = $start_time;
\$total_files_size = $total_files_size;
\$total_files_num = $total_files_num;
\$key_count = $key_count;
\$file_count = $file_count;
\$\$ = $$;
EOM
}
# -S (--checkpoint-sub) tells the re-exec'ed process to continue.
@ARGV = ("-S", @ARGV) ;
print _("Checkpoint reached: re-exec mknmz...\n");
util::dprint(join ' ', ("::::", @ARGV, "\n"));
# Replace the current process; never returns on success.
exec ($0, @ARGV) ;
}
# Append one processed file name to the NMZ.r registry being built.
sub put_registry ($) {
    my ($processed_name) = @_;
    my $fh = util::efopen(">>$var::NMZ{'_r'}");
    print $fh "$processed_name\n";
}
# Initialization
# $CodingSystem: Character Coding System 'euc' or 'sjis'
# Initialization: pick the character coding system ('euc' or 'sjis'),
# complete drive-less configuration paths on DOSish systems, and install
# the SIGINT/SIGTERM handlers.
sub init () {
    $SYSTEM = $^O;
    if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) {
        $CodingSystem = "sjis";
        # When a directory lacks a drive letter or UNC prefix, borrow the
        # drive letter from $0.  The loop variable aliases each package
        # variable, so assigning to it updates the original.
        for my $dir ($CONFDIR, $LIBDIR, $FILTERDIR, $TEMPLATEDIR) {
            if ($dir !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
                $dir = $1 . $dir;
            }
        }
    } else {
        $CodingSystem = "euc";
    }
    $SIG{'INT'} = sub {
        util::cdie("SIGINT caught! Aborted.\n");
    };
    $SIG{'TERM'} = sub {
        print STDERR "SIGTERM caught!\n";
        $ReceiveTERM = 1;
    };
}
# Load the modules that must exist before rcfiles are read (conf.pl is
# loaded first so rcfiles cannot override its variables).
sub preload_modules () {
    unshift @INC, $LIBDIR;
    # workaround for test suites.
    unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
    # BUGFIX: the original wrote  require "conf.pl" || die ...;  '||'
    # binds to the string literal, so the die was unreachable dead code.
    # require itself raises a fatal error on failure, so a plain require
    # is both correct and sufficient.
    require "conf.pl";
    require "util.pl";
    require "gettext.pl";
}
# Load modules that depend on command-line options being parsed.
# BUGFIX: '||' bound to the string literal made the original die
# unreachable; require dies by itself on failure.
sub postload_modules () {
    require "htmlsplit.pl";
}
# Load the remaining library modules and initialize the seed list.
# BUGFIX: each  require "x.pl" || die ...  had an unreachable die ('||'
# binds to the string literal); require dies by itself on failure.
sub load_modules () {
    require "var.pl";
    require "usage.pl";
    require "codeconv.pl";
    require "wakati.pl";
    require "seed.pl";
    require "gfilter.pl";
    @Seed = seed::init();
}
# Discover and register all document filter modules.
sub load_filtermodules () {
    unshift @INC, $FILTERDIR;
    # Windows modules must be loaded first, because OLE filters have low
    # precedence over normal ones.
    load_win32modules() if $SYSTEM eq "MSWin32";
    # Register every generic filter found in the filter directory.
    load_filters(glob "$FILTERDIR/*.pl");
}
# Register the Win32 (OLE) filter modules.  An in-tree filter directory
# is preferred over the installed one to ease developing.
sub load_win32modules () {
    my $dir = (-f "../filter/win32/olemsword.pl")
              ? "../filter/win32"
              : "$FILTERDIR/win32";
    my @found = glob "$dir/*.pl";
    unshift @INC, $dir;
    load_filters(@found);
}
# Register filter modules: require each .pl file and record its media
# types, status, recursion flag and code-conversion requirements in the
# %var:: dispatch tables.
sub load_filters (@) {
    my @filters = @_;
    for my $filter (@filters) {
        # BUGFIX: guard the match — previously a failed match silently
        # reused a stale $1 as the module name.
        next unless $filter =~ m!([-\w]+)\.pl$!;
        my $module = $1;
        # require dies by itself on failure; the original '|| die' bound
        # to the string literal and was unreachable.
        require "$module.pl";
        my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
        # Module name is only known at run time, hence the string evals.
        eval "\@mtypes = ${module}::mediatype();";
        die $@ if $@; # eval error
        eval "\$status = ${module}::status();";
        die $@ if $@;
        eval "\$recursive = ${module}::recursive();";
        die $@ if $@;
        eval "\$pre_codeconv = ${module}::pre_codeconv();";
        die $@ if $@;
        eval "\$post_codeconv = ${module}::post_codeconv();";
        die $@ if $@;
        eval "${module}::add_magic(\$Magic);";
        die $@ if $@;
        for my $mt (@mtypes) {
            # Never downgrade a type that an earlier module already
            # declared supported.
            next if (defined $var::Supported{$mt} &&
                     $var::Supported{$mt} eq 'yes' && $status eq 'no');
            $var::Supported{$mt} = $status;
            $var::REQUIRE_ACTIONS{$mt} = $module;
            $var::RECURSIVE_ACTIONS{$mt} = $recursive;
            $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
            $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
        }
    }
}
# Core routine.
#
# FIXME: Too many parameters. They must be cleared.
#
# Core routine: index one document (or one split part of a document).
# Loads and filters the file, validates it, writes the field/date
# indices, and feeds the text into the word counter and phrase hash.
# Returns the file size on success, 0 on rejection.
#
# FIXME: Too many parameters. They must be cleared.
#
sub namazu_core ($$$$$$$$$$) {
my ($cfile, $docid_count, $docid_base,
$file_count, $field_indices, $fh_errorsfile, $total_files_num,
$uri, $part_id, $part_num) = @_;
my $headings = "";
my $content = "";
my $weighted_str = "";
my %fields;
# Progress prefix: "part/parts - " for split parts, "n/total - " otherwise.
my $msg_prefix;
if ($part_id) {
$msg_prefix = " $part_id/$part_num - ";
} else {
$msg_prefix = $file_count + 1 . "/$total_files_num - ";
}
unless ($uri) {
$uri = generate_uri($cfile); # Make a URI from a file name.
}
# Filters fill $content, $weighted_str, $headings and %fields.
my ($cfile_size, $text_size, $kanji, $mtype) =
load_document(\$cfile, \$content, \$weighted_str,
\$headings, \%fields);
util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
# Check if the file is acceptable.
my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
if (defined $err) {
# Console output needs Shift_JIS on DOSish platforms.
if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) {
my $uri2 = codeconv::eucjp_to_shiftjis($uri);
print $msg_prefix . "$uri2 $err\n";
} else {
print $msg_prefix . "$uri $err\n";
}
print $fh_errorsfile "$cfile $err\n";
return 0; # return 0 if error
}
# Print processing file name as URI.
if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) {
my $uri2 = codeconv::eucjp_to_shiftjis($uri);
print $msg_prefix . "$uri2 [$mtype]\n";
} else {
print $msg_prefix . "$uri [$mtype]\n";
}
# Fill in any fields the filter left empty, then persist them.
complete_field_info(\%fields, $cfile, $uri,
\$headings, \$content, \$weighted_str);
put_field_index(\%fields, $field_indices);
put_dateindex($cfile);
$content .= $weighted_str; # add weights
count_words($docid_count, $docid_base, \$content, $kanji);
make_phrase_hash($docid_count, $docid_base, \$content);
# assertion
util::assert($cfile_size != 0,
"cfile_size == 0 at the end of namazu_core.");
return $cfile_size;
}
#
# Make the URI from the given file name.
#
# Build the URI for a file name, optionally with a #fragment appended.
sub generate_uri (@) {
    my ($file, $fragment) = @_;
    return "" unless defined $file;
    # omit a file name if omittable (trailing index.html etc.)
    $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
    if (defined $ReplaceCode) {
        # transforming URI by evaling user-supplied --replace code on $_
        $_ = $file;
        eval $ReplaceCode;
        $file = $_;
    }
    my $dosish = ($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2");
    if ($dosish) {
        # converting a drive part like: /C|  then fix the encoding
        $file =~ s#^([A-Z]):#/$1|#i;
        $file = codeconv::shiftjis_to_eucjp($file);
    }
    codeconv::toeuc(\$fragment) if defined $fragment;
    unless ($var::Opt{'noencodeuri'}) {
        for my $piece ($file, $fragment) {
            next unless defined $piece;
            # Escape unsafe characters (not strict)
            $piece =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2
            $piece =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
                        sprintf("%%%02X",ord($1))/ge;
        }
    }
    my $uri = $file;
    $uri .= "#" . $fragment if defined $fragment;
    # restore '|' for drive letter rule of Win32, OS/2
    $uri =~ s!^/([A-Z])%7C!/$1|!i if $dosish;
    return $uri;
}
# Record the current size of each NMZ.field.* file (committed file plus
# temporary partial file) so later writes know the byte offset at which
# new field entries begin.
sub get_field_index_base (\%) {
    my ($field_indices) = @_;
    my @keys = split('\|', $conf::SEARCH_FIELD);
    push @keys, split('\|', $conf::META_TAGS) if $var::Opt{'meta'};
    for my $key (@keys) {
        $key = lc($key);
        my $committed = "$var::NMZ{'field'}.$key";
        my $partial   = util::tmpnam("NMZ.field.$key");
        my $offset = 0;
        $offset  = -s $committed if -f $committed;
        $offset += -s $partial   if -f $partial;
        $field_indices->{$key} = $offset;
    }
}
# Fill in any document field (title, date, uri, size, summary, from)
# that the filter did not already supply.
sub complete_field_info (\%$$\$\$\$) {
    my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
    $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref)
        unless defined($fields->{'title'});
    unless (defined($fields->{'date'})) {
        # Default to the file's mtime, RFC 822 formatted.
        my $mtime = (stat($cfile))[9];
        $fields->{'date'} = util::rfc822time($mtime);
    }
    $fields->{'uri'} = $uri
        unless defined($fields->{'uri'});
    $fields->{'size'} = -s $cfile
        unless defined($fields->{'size'});
    $fields->{'summary'} = make_summary($contref, $headings, $cfile)
        unless defined($fields->{'summary'});
    # Sender defaults to a localized "unknown" when neither variant is set.
    $fields->{'from'} = getmsg("unknown")
        unless defined($fields->{'from'}) || defined($fields->{'author'});
}
#
# Currently, messages for NMZ.* files should be encoded in
# EUC-JP currently. We cannot use gettext.pl for the messsage
# because gettext.pl may use Shift_JIS encoded messages.
# So, we should use the function instead of gettext().
#
# FIXME: Ad hoc impl. getmsg() is effective only for "unknown".
#
# Return the EUC-JP translation of $msg for Japanese message locales,
# otherwise the message unchanged.  Only "unknown" is handled (see the
# FIXME above): the literal below is its EUC-JP rendering and must stay
# byte-identical (this file is EUC-JP encoded; it may display garbled).
sub getmsg($) {
my ($msg) = @_;
if (util::islang_msg("ja")) {
if ($msg eq "unknown") {
return "╔╘╠└";
}
}
return $msg;
}
# Build the NMZ.field.summary text for a document: start from the
# headings when present, then append content until MAX_FIELD_LENGTH
# bytes are collected, stripping noise as it goes.  EUC-JP aware.
sub make_summary ($$$) {
my ($contref, $headings, $cfile) = @_;
# pick up $conf::MAX_FIELD_LENGTH bytes string
my $tmp = "";
if ($$headings ne "") {
$$headings =~ s/^\s+//;
$$headings =~ s/\s+/ /g;
$tmp = $$headings;
} else {
$tmp = "";
}
my $offset = 0;
my $tmplen = 0;
# Keep appending chunks of content until the buffer is long enough or
# the content runs out.  Each pass also removes characters whose EUC-JP
# first byte is \xa8 (a symbol row) and squeezes long runs of -=*#.
while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
&& $offset < length($$contref))
{
$tmp .= substr $$contref, $offset, $tmplen;
$offset += $tmplen;
$tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
$tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
}
# -1 means "LF"
my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
# Remove a garbage Kanji 1st char at the end.
$summary = codeconv::chomp_eucjp($summary);
$summary =~ s/^\s+//;
$summary =~ s/\s+/ /g; # normalize white spaces
return $summary;
}
# output the field infomation into NMZ.fields.* files
# output the field infomation into NMZ.fields.* files
# Resolves field-name aliases, then appends each configured field's
# value (or an empty line) to NMZ.field.<key> and the entry's byte
# offset to NMZ.field.<key>.i, updating the running offsets in
# $field_indices.
sub put_field_index (\%$) {
my ($fields, $field_indices) = @_;
# Longest alias first so e.g. "subject" wins over "sub"-like prefixes.
my $aliases_regex =
join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
for my $field (keys %{$fields}) {
util::dprint("Field: $field: $fields->{$field}\n");
# Move an aliased field's value to its canonical name (unless the
# canonical field already has a value), then clear the alias.
if ($field =~ /^($aliases_regex)$/o) {
unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
$fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
}
undef $fields->{$field};
}
}
my @keys = split '\|', $conf::SEARCH_FIELD;
if ($var::Opt{'meta'}) {
push @keys, (split '\|', $conf::META_TAGS);
# uniq @keys
my %mark = ();
@keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
}
for my $key (@keys) {
my $lkey = lc($key);
my $fname = util::tmpnam("NMZ.field.$lkey");
my $fh_field = util::efopen(">>$fname");
my $output = "";
if (defined($fields->{$key})) {
if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
$fields->{$key} =~ s/\s+/ /g;
$fields->{$key} =~ s/\s+$//;
$fields->{$key} =~ s/^\s+//;
}
$output = $fields->{$key};
# -1 means "LF"
$output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
# Remove a garbage Kanji 1st char at the end.
$output = codeconv::chomp_eucjp($output);
$output .= "\n";
} else {
# Fields without a value still consume one (empty) line so the
# per-document record positions stay aligned.
$output = "\n";
}
print $fh_field $output;
# put index of field index
{
my $fname = util::tmpnam("NMZ.field.$lkey.i");
my $fh_field_idx = util::efopen(">>$fname");
# Offset is stored as a big-endian 32-bit integer, then advanced.
print $fh_field_idx pack("N", $field_indices->{$lkey});
$field_indices->{$lkey} += length $output;
}
}
}
# put the date infomation into NMZ.t file
# Append the file's mtime to the NMZ.t date index as a big-endian
# 32-bit integer.
sub put_dateindex ($) {
    my ($target) = @_;
    my $mtime = (stat($target))[9];
    my $fh = util::efopen(">>$var::NMZ{'_t'}");
    print $fh pack("N", $mtime);
}
# load a document file
# load a document file
# Reads the file, detects its media type, runs the matching filter
# (repeating for recursive types like archives), and applies pre/post
# code conversion.  Fills the content/weighted/headings/fields refs.
# Returns ($file_size, $text_size, $kanji_char_count, $media_type);
# errors are signalled via an "x-system="/"x-error=" suffix on the type.
sub load_document ($$$$\%) {
my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
= @_;
my $cfile = $$orig_cfile;
return (0, 0, 0, 0) unless (-f $cfile && -r $cfile);
# for handling a filename which contains Shift_JIS code
# (copy to a safely-named temporary file and work on that).
my $shelter_cfile = "";
if ($SYSTEM eq "MSWin32"
&& $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/)
{
$shelter_cfile = $cfile;
$cfile = util::tmpnam("NMZ.win32");
copy($shelter_cfile, $cfile);
}
my $file_size;
$file_size = util::filesize($cfile); # not only file in feature.
if ($file_size > $conf::FILE_SIZE_MAX) {
return ($file_size, $file_size, 0, 'x-system/x-error');
}
$$contref = util::readfile($cfile);
# $file_size = length($$contref);
# Filtering process.
my $mtype;
my $called_dt = 0;
# Loop because a recursive filter (e.g. an archive) may replace the
# content with inner documents that need re-detection.
while (1) {
if (defined $MediaType) {
# --media-type forces the type unconditionally.
$mtype = $MediaType;
} else {
# Detect by file name (use the original name if sheltered)...
my $mtype_n;
if ($shelter_cfile ne "") {
$mtype_n = $Magic->checktype_byfilename($shelter_cfile);
} else {
$mtype_n = $Magic->checktype_byfilename($cfile);
}
# ...and by content, falling back to magic numbers when the
# content check is missing or too generic.
my $mtype_c = $Magic->checktype_data($$contref);
my $mtype_m;
$mtype_m = $Magic->checktype_magic($$contref)
if ((! defined $mtype_c) ||
$mtype_c =~
/^(text\/html|text\/plain|application\/octet-stream)$/);
$mtype_c = $mtype_m
if (defined $mtype_m &&
$mtype_m !~
/^(text\/html|text\/plain|application\/octet-stream)$/);
$mtype_c = 'text/plain' unless defined $mtype_c;
# decide_type() arbitrates name vs. content only on the first
# pass; later passes trust the content check.
if ($called_dt) {
$mtype = $mtype_c;
} else {
$mtype = decide_type($mtype_n, $mtype_c);
$called_dt = 1;
}
}
util::dprint(_("Detected type: ")."$mtype\n");
# Pre code conversion.
if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
util::dprint("pre_codeconv\n");
codeconv_document($contref);
}
if (! $var::Supported{$mtype} ||
$var::Supported{$mtype} ne 'yes')
{
util::vprint(_("Unsupported media type ")."$mtype\n");
return ($file_size, $file_size, 0, "$mtype; x-system=unsupported");
}
if ($var::REQUIRE_ACTIONS{$mtype}) {
util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
|| die _("unable to require ") .
"\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
my $err = undef;
eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
'::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
if ($err) {
return ($file_size, $file_size, 0, "$mtype; x-error=$err");
}
if ($@) {
util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
return ($file_size, $file_size, 0, "$mtype; x-error=$@");
}
# Post code conversion.
if ($var::REQUIRE_POST_CODECONV{$mtype}) {
util::dprint("post_codeconv\n");
codeconv_document($contref);
}
# Recursive type: detect and filter the unpacked content again.
next if ($var::RECURSIVE_ACTIONS{$mtype});
}
last;
}
# Measure the text size at this time.
my $text_size = length($$contref) + length($$weighted_str);
# Drop the Shift_JIS shelter copy and restore the original name.
if ($SYSTEM eq "MSWin32" && $shelter_cfile ne "") {
unlink $cfile;
$cfile = $shelter_cfile;
}
my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/; # Kanji contained?
$kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
return ($file_size, $text_size, $kanji, $mtype);
}
# Convert the referenced text to EUC-JP and normalize CRLF/CR line
# endings to plain LF.
sub codeconv_document ($) {
    my ($textref) = @_;
    codeconv::toeuc($textref);
    # One pass handles both CRLF and bare CR (equivalent to the former
    # two-substitution sequence).
    $$textref =~ s/\r\n?/\n/g;
}
# Prepare for indexing: load optional modules, verify/lock the index,
# collect the target file list, and return the base document id plus
# the number of files found.
# BUGFIX: this was declared  sub prep ()  (empty prototype) although it
# is called as prep($output_dir, @targets); the mismatch only went
# unenforced because the call site in main() compiles before this
# definition.  The prototype now matches the actual arguments.
sub prep ($@) {
    my ($output_dir, @targets) = @_;
    my $docid_base = 0;
    my @flist = ();
    $var::OUTPUT_DIR = $output_dir;
    require_modules();
    change_filenames();
    check_present_index();
    # In checkpoint-continue mode all the work below already happened.
    return (0, 0) if $CheckPoint{'continue'};
    check_lockfile($var::NMZ{'lock2'});
    print _("Looking for indexing files...\n");
    @flist = find_target(@targets);
    # An existing registry means we are appending to a present index.
    ($docid_base, @flist) = append_index(@flist)
        if -f $var::NMZ{'r'};
    unless (@flist) { # if @flist is empty
        print _("No files to index.\n");
        exit 0;
    }
    set_lockfile($var::NMZ{'lock2'});
    save_flist(@flist);
    my $total_files_num = @flist;
    return ($docid_base, $total_files_num);
}
# Write the list of target files, one per line, to NMZ._flist.
sub save_flist(@) {
    my @targets = @_;
    return unless @targets;
    my $fh = util::efopen(">$var::NMZ{'_flist'}");
    print $fh map { "$_\n" } @targets;
}
# Load the optional Japanese-processing modules (NKF, Text::Kakasi or
# Text::ChaSen) selected by the configuration, and initialize them.
# BUGFIX: each  require Module || die ...  had an unreachable die ('||'
# cannot fire because require raises a fatal error itself on failure).
sub require_modules() {
    if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
        require NKF;
        util::dprint(_("code conversion: using NKF module\n"));
        $var::USE_NKF_MODULE = 1;
    }
    if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
        require Text::Kakasi;
        util::dprint(_("wakati: using Text::Kakasi module\n"));
        my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
    }
    if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
        require Text::ChaSen;
        util::dprint(_("wakati: using Text::ChaSen module\n"));
        # -F '%m %H\n' additionally emits part-of-speech for --use-chasen-noun.
        my @arg = ('-j', '-F', '%m ');
        @arg = ('-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
        my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
    }
}
# Abort early when a lock file already exists: another mknmz process is
# probably updating this index (or a previous run stopped abnormally —
# the latter is not the major purpose, though).  Checking here, before
# the potentially long find_target, gives early detection.
sub check_lockfile ($) {
    my ($lockfile) = @_;
    return unless -f $lockfile;
    print "$lockfile "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
    exit 1;
}
# Create a lock file containing our pid; abort if one already exists
# (another process is probably updating the index).
sub set_lockfile ($) {
    my ($file) = @_;
    if (-f $file) {
        # CONSISTENCY FIX: translate the message with _() exactly as the
        # sibling check_lockfile() does; it was a hard-coded duplicate.
        print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
        exit 1;
    } else {
        my $fh_lockfile = util::efopen(">$file");
        print $fh_lockfile "$$"; # save pid
    }
}
# Delete the lock file when it is present; silently a no-op otherwise.
sub remove_lockfile ($) {
    my ($lockfile) = @_;
    unlink $lockfile if -f $lockfile;
}
# check present index whether it is old type of not
# Die when the existing index is in the obsolete pre-2.0 format
# (NMZ.i present but no NMZ.result.normal).
sub check_present_index () {
    my $old_format = -f $var::NMZ{'i'} && ! -f "$var::NMZ{'result'}.normal";
    util::cdie(_("Present index is old type. it's unsupported.\n"))
        if $old_format;
}
# remain
# Finish the run: when documents were indexed, write all NMZ.* support
# files under a lock; when nothing was indexed but deletions happened,
# still refresh the date index and registry.  Always regenerate the
# head/foot pages, log the run, and clean up temporaries.
sub do_remain_job ($$$$) {
my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
if ($docid_count == 0) {
# No files are indexed
if ($DeletedFilesCount > 0) {
update_dateindex();
update_registry($docid_count);
}
} else {
set_lockfile($var::NMZ{'lock'});
write_version();
write_body_msg();
write_tips_msg();
write_result_file();
update_field_index();
update_dateindex();
update_registry($docid_count);
write_nmz_files();
make_slog_file();
remove_lockfile($var::NMZ{'lock'});
}
make_headfoot_pages($docid_count, $key_count);
put_log($total_files_size, $start_time, $docid_count, $key_count);
util::remove_tmpfiles();
unlink $var::NMZ{'_flist'};
}
# Regenerate every NMZ.head* and NMZ.foot* page from the corresponding
# template, preserving each template's language suffix.
sub make_headfoot_pages($$) {
    my ($docid_count, $key_count) = @_;
    for my $kind ('head', 'foot') {
        for my $template (glob "$TEMPLATEDIR/NMZ.$kind*") {
            # Capture the suffix (e.g. ".ja") after NMZ.head / NMZ.foot.
            $template =~ m!.*/NMZ.$kind(.*)$!;
            my $suffix = $1;
            make_headfoot("$var::NMZ{$kind}${suffix}",
                          $docid_count, $key_count);
        }
    }
}
# Parse command line options.
# Parse command line options.
# Returns ($output_dir, @targets).  Handles --update by inheriting the
# argv/cwd recorded in the index's NMZ.status and recursing once, and
# --checkpoint-sub (-S) as the resumed-process marker.
sub parse_options
{
if (@ARGV == 0) {
show_mini_usage();
exit 1;
}
my @targets = ();
my $targets_loaded = 0;
# Keep a copy so @ARGV can be restored after GetOptions consumes it
# (re_exec and --update need the original argv).
my @argv = @ARGV;
my $cwd = cwd();
my $opt_dummy = 0;
my $opt_version = 0;
my $opt_help = 0;
my $opt_all = 0;
my $opt_chasen = 0;
my $opt_chasen_noun = 0;
my $opt_kakasi = 0;
my $opt_checkpoint_sub = 0;
my $opt_show_config = 0;
my $opt_mailnews = 0;
my $opt_mhonarc = 0;
my $opt_quiet = undef;
my $opt_config = undef;
my $output_dir = undef;
my $update_index = undef;
my $include_file = undef;
my $target_list = undef;
my $index_lang = undef;
# Getopt::Long::Configure('bundling');
Getopt::Long::config('bundling');
# Digits/letters on the left are the short option names (bundling mode).
GetOptions(
'0|help' => \$opt_help,
'1|exclude=s' => \$conf::EXCLUDE_PATH,
'2|deny=s' => \$conf::DENY_FILE,
'3|allow=s' => \$conf::ALLOW_FILE,
'4|update=s' => \$update_index,
'5|mhonarc' => \$opt_mhonarc,
'6|mtime=s' => \$var::Opt{'mtime'},
'7|html-split' => \$var::Opt{'htmlsplit'},
'C|show-config' => \$opt_show_config,
'E|no-edge-symbol' => \$var::Opt{'noedgesymbol'},
'F|target-list=s' => \$target_list,
'G|no-okurigana' => \$var::Opt{'okurigana'},
'H|no-hiragana' => \$var::Opt{'hiragana'},
'I|include=s' => \$include_file,
'K|no-symbol' => \$var::Opt{'nosymbol'},
'L|indexing-lang=s' => \$index_lang,
'M|meta' => \$var::Opt{'meta'},
'O|output-dir=s' => \$output_dir,
'S|checkpoint-sub' => \$opt_checkpoint_sub,
'T|template-dir=s' => \$TEMPLATEDIR,
'U|no-encode-uri' => \$var::Opt{'noencodeuri'} ,
'V|verbose' => \$var::Opt{'verbose'},
'Y|no-delete' => \$var::Opt{'nodelete'},
'Z|no-update' => \$var::Opt{'noupdate'},
'a|all' => \$opt_all,
'c|use-chasen' => \$opt_chasen,
'd|debug' => \$var::Opt{'debug'},
'e|robots' => \$var::Opt{'robotexclude'},
'f|config=s' => \$opt_config,
'h|mailnews' => \$opt_mailnews,
'k|use-kakasi' => \$opt_kakasi,
'm|use-chasen-noun' => \$opt_chasen_noun,
'q|quiet' => \$opt_quiet,
'r|replace=s' => \$ReplaceCode,
's|checkpoint' => \$CheckPoint{'on'},
't|media-type=s' => \$MediaType,
'u|uuencode' => \$opt_dummy, # for backward compat.
'v|version' => \$opt_version,
'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
);
if ($opt_quiet) {
# Make STDOUT quiet by redirecting STDOUT to null device.
my $devnull = util::devnull();
open(STDOUT, ">$devnull") || die "$devnull: $!";
}
if ($opt_config) {
load_rcfile($ConfigFile = $opt_config);
}
load_filtermodules(); # to make effect $opt_config.
postload_modules();
if ($index_lang) {
$util::LANG = $index_lang;
util::dprint("Override indexing language: $util::LANG\n");
}
if ($opt_help) {
show_usage();
exit 1;
}
if ($opt_version) {
show_version();
exit 1;
}
if ($opt_show_config) {
show_config();
exit 1;
}
# --update: re-parse using the argv/cwd stored in the index's NMZ.status
# file, then force the output directory to the index being updated.
if (defined $update_index) {
unless (-d $update_index) {
print _("No such index: "), "$update_index\n";
exit 1;
}
my $orig_status = $var::NMZ{'status'};
$var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
my $argv = get_status("argv");
@ARGV = split /\t/, $argv;
util::dprint(_("Inherited argv: ")."@ARGV\n");
my $cwd = get_status("cwd");
chdir $cwd;
util::dprint(_("Inherited cwd: ")."$cwd\n");
# Recurse once with the inherited argv.
($output_dir, @targets) = parse_options();
$output_dir = $update_index;
$var::NMZ{'status'} = $orig_status; # See also change_filenames()
return ($output_dir, @targets);
}
if ($opt_mailnews) {
$MediaType = 'message/rfc822';
}
if ($opt_mhonarc) {
$MediaType = 'text/html; x-type=mhonarc';
}
if ($opt_all) {
$conf::ALLOW_FILE = ".*";
}
# The three wakati options are mutually overriding; last one wins.
if ($opt_chasen) {
$conf::WAKATI = $conf::CHASEN;
$var::Opt{'noun'} = 0;
}
if ($opt_chasen_noun) {
$conf::WAKATI = $conf::CHASEN_NOUN;
$var::Opt{'noun'} = 1;
}
if ($opt_kakasi) {
$conf::WAKATI = $conf::KAKASI;
$var::Opt{'noun'} = 0;
}
if ($include_file) {
do $include_file;
util::dprint("Included: $include_file\n");
}
if ($target_list) {
# On a checkpoint resume the real list position is restored from the
# checkpoint file, so a placeholder target suffices.
if ($CheckPoint{'continue'}) {
@targets = ("dummy");
} else {
@targets = load_target_list($target_list);
util::dprint(_("Loaded: ")."$target_list\n");
}
$targets_loaded = 1;
}
if ($opt_checkpoint_sub) {
$CheckPoint{'on'} = 1;
$CheckPoint{'continue'} = 1;
@argv = grep {! /^-S$/} @argv; # remove -S
}
# Sanity-check the user-supplied --replace code against a sample path.
if (defined $ReplaceCode) {
my $orig = "/foo/bar/baz/quux.html";
$_ = $orig;
eval $ReplaceCode;
if ($@) { # eval error
util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
}
util::dprint(_("Replace: ")."$orig -> $_\n");
}
if (@ARGV == 0 && $targets_loaded == 0) {
show_mini_usage();
exit 1;
}
$output_dir = $cwd unless defined $output_dir;
util::cdie("$output_dir: "._("invalid output directory\n"))
unless (-d $output_dir && -w $output_dir);
if ($SYSTEM eq "MSWin32" || $SYSTEM eq "os2") {
# convert \ to / with consideration for Shift_JIS Kanji code
$output_dir =~
s!([\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x01-\x7f])!
$1 eq "\\" ? "/" : $1!gex;
}
# take remaining @ARGV as targets
if (@ARGV > 0 && $targets_loaded == 0) {
@targets = @ARGV ;
}
# revert @ARGV
# unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
@ARGV = @argv;
return ($output_dir, @targets);
}
# Print the effective configuration (--show-config): loaded rcfiles,
# tool paths, language settings, directories and supported media types.
sub show_config () {
print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
print _("System: ") . "$SYSTEM\n" if $SYSTEM;
print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
print _("Perl: ") . "$]\n" if $]; # '$]' has a perl version
print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
print _("Lang_Msg: ") . "$util::LANG_MSG\n";
print _("Lang: ") . "$util::LANG\n";
print _("Coding System: ") . "$CodingSystem\n";
print _("CONFDIR: ") . "$CONFDIR\n";
print _("LIBDIR: ") . "$LIBDIR\n";
print _("FILTERDIR: ") . "$FILTERDIR\n";
print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
# Only media types whose filters report status 'yes' are listed.
my @supported = sort grep { $var::Supported{$_} eq "yes" }
keys %var::Supported;
print _("Supported media types: \n");
for my $mtype (@supported) {
print " $mtype\n";
}
}
# Read the list of target files named by --target-list=FILE, one path
# per line, normalizing DOSish path separators.
sub load_target_list ($) {
    my ($list_file) = @_;
    my $fh_targets = util::efopen($list_file);
    my @targets = <$fh_targets>;
    if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) {
        # IDIOM FIX: this used grep in void context purely for its side
        # effects; a for loop states the intent (each element is aliased
        # by $_, so the substitutions modify @targets in place).
        for (@targets) {
            s/\r//g;
            # Replace \ with / with consideration for Shift_JIS.
            s!([\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x01-\x7f])!
              $1 eq "\\" ? "/" : $1!gex;
        }
    }
    chomp @targets;
    return @targets;
}
# convert a relative path into an absolute path
# Convert a relative path into an absolute path rooted at $cwd.
sub absolute_path($$) {
    my ($cwd, $path) = @_;
    # A bare "." becomes "./" so the next rule can expand it.
    $path =~ s!^\.$!\./!;
    $path =~ s!^\.[/\\]!$cwd/!;
    if ($SYSTEM ne "MSWin32" && $SYSTEM ne "os2") {
        # Anything not starting with '/' is relative to $cwd.
        $path =~ s!^([^/])!$cwd/$1!;
    } else {
        # Normalize backslashes (Shift_JIS aware), then prefix drive-less
        # paths with $cwd.
        $path =~ s!([\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x01-\x7f])!
                   $1 eq "\\" ? "/" : $1!gex;
        $path =~ s,^([A-Z](?!:/)),$cwd/$1,i;
    }
    return $path;
}
# Collect the list of files to index from the given targets (files
# and/or directories).  Directories are traversed with File::Find; a
# symlinked directory is expanded one level by hand because
# File::Find::find() does not follow symlinks.  Returns the sorted,
# de-duplicated file list.
sub find_target (@) {
    my @targets = @_;
    my $cwd = cwd();
    @targets = map { absolute_path($cwd, $_) } @targets;
    # Convert \ to / with consideration for Shift_JIS encoding.
    if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) {
        # BUGFIX: was a void-context grep used for its side effect.
        for my $target (@targets) {
            $target =~ s!([\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x01-\x7f])!
                         $1 eq "\\" ? "/" : $1!gex;
        }
    }
    # For reporting effects of --allow, --deny, --exclude, --mtime
    # options in --verbose mode.
    my %counts = (
        'possible'    => 0,
        'excluded'    => 0,
        'too_old'     => 0,
        'too_new'     => 0,
        'not_allowed' => 0,
        'denied'      => 0,
    );
    # Traverse directories.
    # This routine is not efficient but I prefer reliable logic.
    my @flist = ();
    my $start = time();
    util::vprint(_("find_target starting: "). localtime($start). "\n");
    while (@targets) {
        my $target = shift @targets;
        if ($target eq '') {
            print STDERR "Warning: target contains empty line, skip it\n";
            next;
        }
        if (-f $target) { # target is a file.
            add_target($target, \@flist, \%counts);
        } elsif (-d $target) { # target is a directory.
            my @subtargets = ();
            # Expand a symlinked directory by hand because
            # File::Find::find() does not follow symlinks.
            if (-l $target) {
                my $dh = DirHandle->new($target);  # was indirect "new DirHandle"
                while (defined(my $ent = $dh->read)) {
                    next if ($ent =~ /^\.{1,2}$/);
                    my $fname = "$target/$ent";
                    next if ($fname eq '.' || $fname eq '..');
                    if (-d $fname) {
                        push(@subtargets, $fname);
                    } else {
                        add_target($fname, \@flist, \%counts);
                    }
                }
            } else {
                @subtargets = ($target);
            }
            # Wanted routine for File::Find's find().
            my $wanted_closure = sub {
                my $fname = "$File::Find::dir/$_";
                add_target($fname, \@flist, \%counts);
            };
            find($wanted_closure, @subtargets) if (@subtargets > 0);
        } else {
            print STDERR _("unsupported target: ") . $target;
        }
    }
    # Remove duplicates, keeping the first occurrence of each file.
    my %mark = ();
    @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
    # Sort file names with consideration for numbers (Schwartzian
    # transform on a key whose digit runs are zero-padded to 8 places).
    @flist = map { $_->[0] }
             sort { $a->[1] cmp $b->[1] }
             map { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
                   [ $_, $tmp ] } @flist;
    my $elapsed = time() - $start;
    $elapsed += 1; # round up and avoid division by zero in the report
    # For --verbose option.
    report_find_target($elapsed, $#flist + 1, %counts);
    return @flist;
}
# Examine one candidate file and push it onto @$flists_ref when it
# passes all filters: readability, $conf::EXCLUDE_PATH, the --mtime
# option, $conf::DENY_FILE and $conf::ALLOW_FILE.  %$counts_ref tallies
# why files were rejected, for --verbose reporting.
sub add_target ($\@\%) {
    my ($target, $flists_ref, $counts_ref) = @_;
    if ($target =~ /[\n\r\t]/) {
        $target =~ s/[\n\r\t]//g;
        print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
        return; # skip a file name containing LF/CR/TAB chars.
    }
    return unless -f $target; # Only file is targeted.
    $counts_ref->{'possible'}++;
    unless (-r $target) {
        util::vprint(sprintf(_("Unreadable: %s"), $target));
        $counts_ref->{'excluded'}++;
        return;
    }
    if (defined $conf::EXCLUDE_PATH &&
        $target =~ /$conf::EXCLUDE_PATH/ )
    {
        util::vprint(sprintf(_("Excluded: %s"), $target));
        $counts_ref->{'excluded'}++;
        return;
    }
    #
    # Do processing just like find's --mtime option.
    #
    if (defined $var::Opt{'mtime'}) {
        # BUGFIX: was "-M $_", which only worked by accident inside
        # File::Find's wanted routine (where $_ is the basename and the
        # cwd is the containing dir) and was wrong for targets passed
        # directly or for symlinked-directory entries.
        my $mtime = -M $target;
        if ($var::Opt{'mtime'} < 0) {
            # This must be `>=' not `>' for consistency with find(1).
            if (int($mtime) >= - $var::Opt{'mtime'}) {
                util::vprint(sprintf(_("Too old: %s"), $target));
                $counts_ref->{'too_old'}++;
                return;
            }
        } elsif ($var::Opt{'mtime'} > 0) {
            if ($var::Opt{'mtime'} =~ /^\+/) {
                if ((int($mtime) < $var::Opt{'mtime'})) {
                    util::vprint(sprintf(_("Too new: %s"), $target));
                    $counts_ref->{'too_new'}++;
                    return;
                }
            } else {
                # Exact-day match required when no sign was given.
                if (int($mtime) != $var::Opt{'mtime'}) {
                    if (int($mtime) > $var::Opt{'mtime'}) {
                        util::vprint(sprintf(_("Too old: %s"),$target));
                        $counts_ref->{'too_old'}++;
                    } else {
                        util::vprint(sprintf(_("Too new: %s"),$target));
                        $counts_ref->{'too_new'}++;
                    }
                    return;
                }
            }
        } else {
            # $var::Opt{'mtime'} == 0: nothing can match, reject all.
            return;
        }
    }
    # Extract the file name (basename) of the target.
    $target =~ m!^.*/([^/]+)$!;
    my $fname = $1;
    if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
        util::vprint(sprintf(_("Denied: %s"), $target));
        $counts_ref->{'denied'}++;
        return;
    }
    if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
        util::vprint(sprintf(_("Not allowed: %s"), $target));
        $counts_ref->{'not_allowed'}++;
        return;
    } else {
        util::vprint(sprintf(_("Targeted: %s"), $target));
        push @$flists_ref, $target;
    }
}
# Report the outcome of find_target() via util::vprint (i.e. only in
# --verbose mode): targeted file count, scan timing, and the per-reason
# rejection tallies gathered in %counts.
sub report_find_target ($$%) {
    my ($elapsed, $num_targeted, %counts) = @_;
    util::vprint(_("find_target finished: ") . localtime(time()). "\n");
    # $elapsed is >= 1 (the caller rounds up), so division is safe.
    util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"),
                         $num_targeted, $elapsed,
                         $num_targeted /$elapsed));
    util::vprint(sprintf(_(" Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"),
                         $counts{'possible'},
                         $counts{'not_allowed'},
                         $counts{'denied'},
                         $counts{'excluded'}));
    util::vprint(sprintf(_(" MTIME too old: %d, MTIME too new: %d"),
                         $counts{'too_old'},
                         $counts{'too_new'}));
}
# Print the full --help text.  The text lives in $usage::USAGE; doubled
# newlines are collapsed before the gettext lookup, and the version and
# contact address are spliced in through printf placeholders.
sub show_usage () {
    util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
    util::dprint(_("lang: ")."$util::LANG\n");
    # To know why we should do this, see usage.pl.
    my $usage = $usage::USAGE;
    $usage =~ s/\n\n/\n/g;
    $usage = _($usage);
    printf "$usage", $var::VERSION, $var::MAILING_ADDRESS;
}
# Print the short usage hint shown on command-line errors.
sub show_mini_usage () {
    print _("Usage: mknmz [options] <target>...\n");
    print _("Try `mknmz --help' for more information.\n");
}
# Print the version banner (text prepared in usage.pl).
sub show_version () {
    print $usage::VERSION_INFO;
}
#
# Check a file before indexing.  Returns undef when the file is fine,
# otherwise a (translated) message explaining why it is skipped.
#   $cfile      - path of the file
#   $cfile_size - its size in bytes
#   $text_size  - size of the extracted text
#   $mtype      - media type, possibly carrying "; x-..." annotations
#   $uri        - accepted for the caller's convenience; unused here
#
sub check_file ($$$$$) {
    my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
    my $msg = undef;
    if (! -e $cfile) {
        $msg = _("does NOT EXIST! skipped.");
    } elsif (! -r $cfile) {
        $msg = _("is NOT READABLE! skipped.");
    } elsif ($text_size == 0 || $cfile_size == 0) {
        $msg = _("is 0 size! skipped.");
    } elsif ($mtype =~ /^application\/octet-stream/) {
        $msg = _("may be a BINARY file! skipped.");
    } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
        $msg = _("is too LARGE a text! skipped.");
    } elsif ($mtype =~ /; x-system=unsupported$/) {
        $mtype =~ s/; x-system=unsupported$//;
        $msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
    } elsif ($mtype =~ /; x-error=.*$/) {
        # A filter recorded an error message; report it verbatim.
        $mtype =~ s/^.*; x-error=(.*)$/$1/;
        $msg = $mtype;
    } elsif ($mtype =~ /^x-system/) {
        # BUGFIX: " skipped." was plain-string concatenated here while
        # the sibling branch above translates it; add the missing _().
        $msg = _("system error occurred! ")."($mtype)"._(" skipped.");
    }
    return $msg;
}
#
# Write the NMZ.version file.
#
sub write_version() {
    # Create it only once; an existing NMZ.version is left untouched.
    unless (-f $var::NMZ{'version'}) {
        my $fh = util::efopen(">$var::NMZ{'version'}");
        print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
    }
}
#
# Promote each temporary index file (NMZ._x) to its final name (NMZ.x).
#
sub write_nmz_files () {
    for my $suffix (qw(i ii w wi p pi)) {
        util::Rename($var::NMZ{"_$suffix"}, $var::NMZ{$suffix});
    }
}
# Output NMZ.body and its language-suffixed variants, one for each
# NMZ.body* template found under $TEMPLATEDIR.
sub write_body_msg () {
    for my $template (glob "$TEMPLATEDIR/NMZ.body*") {
        next unless $template =~ m!.*/NMZ.body(.*)$!;
        write_message($var::NMZ{'body'} . $1);
    }
}
# Output NMZ.tips and its language-suffixed variants, one for each
# NMZ.tips* template found under $TEMPLATEDIR.
sub write_tips_msg () {
    for my $template (glob "$TEMPLATEDIR/NMZ.tips*") {
        next unless $template =~ m!.*/NMZ.tips(.*)$!;
        write_message($var::NMZ{'tips'} . $1);
    }
}
# Output NMZ.result.*: copy every NMZ.result.* template into the output
# directory, leaving any already-existing result file untouched.
sub write_result_file () {
    # BUGFIX: removed an unused local ($fname = "NMZ.result.normal").
    my @files = glob "$TEMPLATEDIR/NMZ.result.*";
    for my $file (@files) {
        $file =~ m!(NMZ\.result\.[^/]*)$!;
        my $target = "$var::OUTPUT_DIR/$1";
        if (-f $target) { # already exists; keep the current copy
            next;
        } else {
            my $buf = util::readfile($file);
            my $fh_file = util::efopen(">$target");
            print $fh_file $buf;
        }
    }
}
# Write NMZ.body etc.: create $msgfile from the template with the same
# basename under $TEMPLATEDIR.  An existing file is kept as-is and a
# missing template is silently skipped.
sub write_message ($) {
    my ($msgfile) = @_;
    return if -f $msgfile;             # never clobber an existing file
    $msgfile =~ m!.*/(.*)$!;
    my $template = "$TEMPLATEDIR/$1";
    return unless -f $template;
    my $content = util::readfile($template);
    my $fh_out = util::efopen(">$msgfile");
    print $fh_out $content;
}
#
# Make the NMZ.slog file for logging.
#
sub make_slog_file () {
    # The bare block scopes the filehandle so it is closed immediately;
    # opening with ">>" merely ensures the file exists without
    # truncating it.
    {
        my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
    }
    # World-writable — presumably so another process (e.g. the search
    # CGI running as a different user) can append; confirm before
    # tightening.
    chmod 0666, $var::NMZ{'slog'};
}
#
# Prefix every entry of %var::NMZ with the output directory.  Keys
# starting with "_" denote temporary files and are additionally run
# through util::tmpnam().
#
sub change_filenames ($) {
    my $dir = $var::OUTPUT_DIR;
    for my $key (sort keys %var::NMZ) {
        next if $key =~ /^_/; # exclude temporary file
        $var::NMZ{$key} = "$dir/$var::NMZ{$key}";
    }
    # temporary files
    for my $key (sort keys %var::NMZ) {
        if ($key =~ /^_/) {
            $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
        }
    }
    if ($var::Opt{'debug'}) {
        for my $key (sort keys %var::NMZ) {
            util::dprint("NMZ: $var::NMZ{$key}\n");
        }
    }
}
#
# Preparation processing for appending to an existing index.  Computes
# the docid base (count of already registered documents) via
# set_target_files(), exits early when nothing needs indexing, and
# copies the live index files to temporary names so new data can be
# appended to the copies.
#
sub append_index (@) {
    my @flist = @_;
    my $docid_base = 0;
    ($docid_base, @flist) = set_target_files(@flist);
    unless (@flist) { # if @flist is empty
        # Even with nothing to add, pending deletions still require the
        # bookkeeping files to be finalized before exiting.
        if ($DeletedFilesCount > 0) {
            make_headfoot_pages(0, 0);
            set_lockfile($var::NMZ{'lock2'});
            update_dateindex();
            update_registry(0);
            put_log(0, 0, 0, get_total_keys());
            util::remove_tmpfiles();
        }
        print _("No files to index.\n");
        exit 0;
    }
    $APPENDMODE = 1;
    # conserve files by copying
    copy($var::NMZ{'i'}, $var::NMZ{'_i'});
    copy($var::NMZ{'w'}, $var::NMZ{'_w'});
    copy($var::NMZ{'t'}, $var::NMZ{'_t'})
        unless -f $var::NMZ{'_t'}; # already written by preupdate_dateindex?
    copy($var::NMZ{'p'}, $var::NMZ{'_p'});
    copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
    return ($docid_base, @flist);
}
#
# Set target files to @flist and return with the number of registered
# documents (the docid base).  Compares the files found on disk with
# the registry (NMZ.r) and date index (NMZ.t) to classify documents as
# new, overlapped (already registered), updated or deleted.
#
sub set_target_files() {
    my %rdocs; # 'rdocs' means 'registered documents'
    my @found_files = @_;
    # Load the list of registered documents
    $rdocs{'name'} = load_registry();
    # Mark registered names; entries carrying the "# " deletion prefix
    # are skipped.  BUGFIX: was a void-context grep with side effects.
    my %mark1;
    my @overlapped_files;
    for my $name (@{$rdocs{'name'}}) {
        $mark1{$name}++ unless $name =~ /^\# /;
    }
    # Pick up overlapped documents (found on disk AND registered).
    $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
    for my $overlapped (grep { $mark1{$_} } @found_files) {
        $rdocs{'overlapped'}{$overlapped} = 1;
        push @overlapped_files, $overlapped;
    }
    # Pick up not overlapped documents which are files to index.
    my @flist = grep { ! $mark1{$_} } @found_files;
    if ($var::Opt{'noupdate'}) {
        return (scalar @{$rdocs{'name'}}, @flist);
    }
    # Load the date index.
    $rdocs{'mtime'} = load_dateindex();
    if (@{$rdocs{'mtime'}} == 0) {
        return (scalar @{$rdocs{'name'}}, @flist);
    }
    util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
                 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
    # Pick up deleted documents and do marking
    # (registered in the NMZ.r but not existent in the filesystem)
    my @deleted_documents;
    unless ($var::Opt{'nodelete'}) {
        # BUGFIX: was a void-context grep with side effects.
        my %mark2;
        $mark2{$_}++ for @found_files;
        for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} &&
                                ! $rdocs{'overlapped'}{$_} }
                         @{$rdocs{'name'}})
        {
            $rdocs{'deleted'}{$deleted} = 1;
            push @deleted_documents, $deleted;
        }
    }
    # Pick up updated documents and set the missing number for deleted files.
    my @updated_documents = pickup_updated_documents(\%rdocs);
    # Append updated files to the list of files to index.
    if (@updated_documents) {
        push @flist, @updated_documents;
    }
    # Remove duplicates.
    my %seen = ();
    @flist = grep { ! $seen{$_}++ } @flist;
    util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
    util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
    util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
    util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
    util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
    util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
    # Update NMZ.t with the missing number information and
    # append updated files and deleted files to NMZ.r with leading '# '
    if (@updated_documents || @deleted_documents) {
        $DeletedFilesCount = 0;
        $UpdatedFilesCount = 0;
        $UpdatedFilesCount += @updated_documents;
        $DeletedFilesCount += @deleted_documents;
        preupdate_dateindex(@{$rdocs{'mtime'}});
        preupdate_registry(@updated_documents, @deleted_documents);
    }
    # Return the number of registered documents and list of files to index.
    return (scalar @{$rdocs{'name'}}, @flist);
}
# Write the given document paths to the temporary registry NMZ._r, each
# as a deletion mark ("# <path>\n"), followed by a timestamped
# "## deleted:" trailer.
sub preupdate_registry(@) {
    my (@list) = @_;
    my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
    # BUGFIX: was a grep relying on a s/// side effect; map states the
    # transformation directly and leaves @list untouched.
    print $fh_registry map { "\# $_\n" } @list;
    print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
}
# Rewrite the temporary date index NMZ._t with the given mtimes,
# packed as 32-bit big-endian unsigned integers.
sub preupdate_dateindex(@) {
    my @mtimes = @_;
    # Since rewriting the entire file, it is not efficient,
    # but simple and reliable. this would be revised in the future.
    my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
    # print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
    print $fh_dateindex pack("N*", @mtimes);
}
# Append the contents of the temporary registry NMZ._r to NMZ.r, add
# an "## indexed:" timestamp when documents were actually indexed,
# then remove the temporary file.
sub update_registry ($) {
    my ($docid_count) = @_;
    # The bare block scopes both handles so they are closed before the
    # unlink below.
    {
        my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
        my $fh_registry_ = util::efopen($var::NMZ{'_r'});
        while (defined(my $line = <$fh_registry_>)) {
            print $fh_registry $line;
        }
        if ($docid_count > 0) {
            print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
        }
    }
    unlink $var::NMZ{'_r'};
}
# Promote the temporary date index NMZ._t to its final name NMZ.t.
sub update_dateindex () {
    util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
}
# Append each NMZ.field.*.tmp file to its permanent NMZ.field.* (or
# NMZ.field.*.i) counterpart, then delete the temporary file.
sub update_field_index () {
    my @list = glob "$var::NMZ{'field'}.*.tmp";
    for my $tmp (@list) {
        # $1 = full .tmp path, $2 = destination path without ".tmp".
        if ($tmp =~ m!((^.*/NMZ\.field\.[^\.]+(?:\.i)?)\.tmp)!) {
            my $fname_tmp = $1;
            my $fname_out = $2;
            # Bare block closes both handles before the unlink below.
            {
                my $fh_field = util::efopen(">>$fname_out");
                my $fh_tmp = util::efopen($fname_tmp);
                while (defined(my $line = <$fh_tmp>)) {
                    print $fh_field $line;
                }
            }
            unlink $fname_tmp;
        } else {
            util::cdie(_("update_field_index: ")."@list");
        }
    }
}
# Walk the registered document list and detect deletions and updates:
#   - a document marked deleted gets its mtime slot set to -1;
#   - an overlapped (still existing) document whose on-disk mtime
#     differs from the recorded one is reported, its mtime slot set to
#     -1, and its path collected for re-indexing.
# Returns the list of updated document paths.
sub pickup_updated_documents (\%) {
    my ($rdocs_ref) = @_;
    my @updated_documents = ();
    # To avoid duplicated outputs caused by --html-split support.
    my %printed = ();
    my $i = 0;  # index into the parallel 'mtime' array
    for my $cfile (@{$rdocs_ref->{'name'}}) {
        if (defined($rdocs_ref->{'deleted'}{$cfile})) {
            unless ($printed{$cfile}) {
                print "$cfile " . _("was deleted!\n");
                $printed{$cfile} = 1;
            }
            $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
        } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
            my $cfile_mtime = (stat($cfile))[9];
            my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
            if ($rfile_mtime != $cfile_mtime) {
                # The file is updated!
                unless ($printed{$cfile}) {
                    print "$cfile " . _("was updated!\n");
                    $printed{$cfile} = 1;
                }
                push(@updated_documents, $cfile);
                $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
            }
        }
        $i++;
    }
    return @updated_documents
}
# Load the date index NMZ.t (document mtimes stored as 32-bit
# big-endian unsigned integers) and return them as an array ref.
sub load_dateindex() {
    my $fh = util::efopen($var::NMZ{'t'});
    my $raw = "";
    read($fh, $raw, -s $var::NMZ{'t'});
    return [ unpack("N*", $raw) ];
}
# Read NMZ.r and return an array ref of registered document paths in
# file order.  A path with recorded deletions is returned with a
# leading "# " (one "# " entry per deletion record) so callers can
# recognize deleted documents.
sub load_registry () {
    my $fh_registry = util::efopen($var::NMZ{'r'});
    my @list = ();
    my %deleted = ();     # path -> number of deletion records seen
    my @registered = ();  # live paths in registration order
    while (defined(my $line = <$fh_registry>)) {
        chomp($line);
        next if $line =~ /^\s*$/; # an empty line
        next if $line =~ /^##/;   # a comment (timestamps etc.)
        if ($line =~ s/^\#\s+//) { # deleted document
            $deleted{$line}++;
        } else {
            # Remove HTML's anchor generated by --html-split option.
            $line =~ s/\t.*$//g;
            push @registered, $line;
        }
    }
    # Exclude deleted documents, consuming one deletion record each.
    for my $doc (@registered) {
        if ($deleted{$doc}) {
            push @list, "# $doc";
            $deleted{$doc}--;
        } else {
            push @list, $doc;
        }
    }
    return [ @list ];
}
# Total keyword count recorded in NMZ.status, or 0 when absent.
sub get_total_keys() {
    my $total = get_status("keys");
    return defined $total ? $total : 0;
}
# Total document count recorded in NMZ.status, or 0 when absent.
sub get_total_files() {
    my $total = get_status("files");
    return defined $total ? $total : 0;
}
# Look up $key in the NMZ.status file.  Returns its recorded value, or
# undef when the file is missing or the key is not present.
sub get_status($) {
    my ($key) = @_;
    my $fh = util::fopen($var::NMZ{'status'});
    return undef unless defined $fh;
    while (defined(my $line = <$fh>)) {
        # BUGFIX: quote $key with \Q...\E so regex metacharacters in a
        # key cannot corrupt the match (current callers pass plain
        # words, but the interpolation was unguarded).
        if ($line =~ /^\Q$key\E\s+(.*)$/) {
            util::dprint("status: $key = $1\n");
            $fh->close;
            return $1;
        }
    }
    return undef;
}
# Record the total document count in NMZ.status (commas stripped).
sub put_total_files($) {
    my ($total) = @_;
    $total =~ tr/,//d;
    put_status("files", $total);
}
# Record the total keyword count in NMZ.status (commas stripped).
sub put_total_keys($) {
    my ($total) = @_;
    $total =~ tr/,//d;
    put_status("keys", $total);
}
# Record "$key $value" in NMZ.status.  If the key already has a value,
# the whole file is removed first — NOTE(review): this drops the other
# keys too; callers such as put_log() rewrite all of them afterwards.
sub put_status($$) {
    my ($key, $value) = @_;
    # remove NMZ.status file if the file has a previous value.
    unlink $var::NMZ{'status'} if defined get_status($key);
    my $fh = util::efopen(">> $var::NMZ{'status'}");
    print $fh "$key $value\n";
}
# Do logging: write a summary of this indexing run to stdout and to
# NMZ.log, and record the new totals in NMZ.status.
#   $total_files_size  - bytes of documents processed this run
#   $start_time        - epoch seconds when indexing began (0 = unknown)
#   $docid_count       - documents indexed this run
#   $total_keys_count  - total keyword count after this run
sub put_log ($$$$) {
    my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
    my $date = localtime;
    my $added_files_count = $docid_count - $UpdatedFilesCount;
    my $deleted_documents_count = $DeletedFilesCount;
    my $updated_documents_count = $UpdatedFilesCount;
    my $total_files_count = get_total_files() + $docid_count
        - $DeletedFilesCount - $UpdatedFilesCount;
    my $added_keys_count = $total_keys_count - get_total_keys();
    my $processtime = time - $start_time;
    $processtime = 0 if $start_time == 0;
    # BUGFIX: removed two no-op self-assignments of $total_files_size
    # and $total_keys_count that were here.
    # Build (msgid, value) pairs.  N_() only marks a string for
    # translation: stdout gets the _()-translated text below while
    # NMZ.log keeps the untranslated original.
    my @logmsgs = ();
    if ($APPENDMODE) {
        push @logmsgs, N_("[Append]");
    } else {
        push @logmsgs, N_("[Base]");
    }
    push @logmsgs, N_("Date:"), "$date" if $date;
    push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
        if $added_files_count;
    push @logmsgs, N_("Deleted Documents:"),
        util::commas("$deleted_documents_count") if $deleted_documents_count;
    push @logmsgs, N_("Updated Documents:"),
        util::commas("$updated_documents_count") if $updated_documents_count;
    push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
        if $total_files_size;
    push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count")
        if $total_files_count;
    push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count")
        if $added_keys_count;
    push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count")
        if $total_keys_count;
    push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
    push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
        if $processtime;
    push @logmsgs, N_("File/Sec:"), sprintf "%.2f",
        (($added_files_count + $updated_documents_count) / $processtime)
        if $processtime;
    push @logmsgs, N_("System:"), "$SYSTEM" if $SYSTEM;
    push @logmsgs, N_("Perl:"), "$]" if $]; # '$]' has a perl version
    push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
    my $log_for_file = "";
    my $msg = shift @logmsgs; # [Base] or [Append]
    # To stdout, use gettext.
    print _($msg), "\n";
    # To log file, do not use gettext.
    $log_for_file = $msg . "\n";
    while (@logmsgs) {
        my $field = shift @logmsgs;
        my $value = shift @logmsgs;
        printf "%-20s %s\n", _($field), "$value";
        $log_for_file .= sprintf "%-20s %s\n", $field, "$value";
    }
    print "\n";
    $log_for_file .= "\n";
    put_log_to_logfile($log_for_file);
    put_total_files($total_files_count);
    put_total_keys($total_keys_count);
    # Record the invocation details for later incremental runs.
    my $argv = join "\t", @ARGV;
    my $cwd = cwd();
    put_status("argv", $argv);
    put_status("cwd", $cwd);
}
# Append one run's summary text to NMZ.log.
sub put_log_to_logfile ($) {
    my ($logmsg) = @_;
    my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
    print $fh_logfile $logmsg;
}
# Current four-digit year, taken from localtime.
sub get_year() {
    return 1900 + (localtime)[5];
}
# Compose NMZ.head and NMZ.foot.  Prepare samples from $TEMPLATEDIR if
# the file does not exist yet, then substitute the document count,
# keyword count, date, version and contact address into the
# <!-- FILE -->, <!-- KEY -->, <!-- DATE -->, <!-- VERSION -->,
# <!-- ADDRESS --> and <!-- LINK-REV-MADE --> placeholder pairs.
sub make_headfoot ($$$) {
    my ($file, $docid_count, $key_count) = @_;
    my $day = sprintf("%02d", (localtime)[3]);
    my $month = sprintf("%02d", (localtime)[4] + 1);
    my $year = get_year();
    my $buf = "";
    if (-f $file) {
        $buf = util::readfile($file);
    } else {
        # No file yet: fall back to the template of the same basename.
        $file =~ m!.*/(.*)$!;
        my $template = "$TEMPLATEDIR/$1";
        if (-f $template) {
            $buf = util::readfile($template);
        } else {
            return;
        }
    }
    my $fh_file = util::efopen(">$file");
    if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
        my $total_files_count = util::commas(get_total_files() + $docid_count
                                             - $DeletedFilesCount - $UpdatedFilesCount);
        $buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
    }
    if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
        # BUGFIX: dead code removed — the old value captured from the
        # template was read and comma-stripped, then unconditionally
        # overwritten by $key_count anyway.
        my $tmp = util::commas($key_count);
        $buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
    }
    $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
    $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
    $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
             {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
    $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
             {$1\n<link rev=made href="mailto:$conf::ADDRESS">\n$3}gs;
    print $fh_file $buf;
}
# Make phrase hashes for NMZ.p.
# Hash every adjacent word pair of the document to a value in 0-65535
# and append this document's id — delta-encoded against the previous
# document containing that phrase, BER-packed — to the per-hash posting
# list in the global %PhraseHash.
sub make_phrase_hash ($$$) {
    my ($docid_count, $docid_base, $contref) = @_;
    my %tmp = ();  # phrase hashes already recorded for this document
    $$contref =~ s!\x7f */? *\d+ *\x7f!!g; # remove tags of weight
    $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
    my @words = split(/\s+/, $$contref);
    @words = grep {$_ ne ""} @words; # remove empty words
    my $word_b = shift @words;  # the previous word of the pair
    my $docid = $docid_count + $docid_base;
    for my $word (@words) {
        my $hash = hash($word_b . $word);
        unless (defined $tmp{$hash}) {  # record each hash once per doc
            $tmp{$hash} = 1;
            $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
            # "w" = BER-compressed integer; store the docid delta.
            $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
            # util::dprint("<$word_b, $word> $hash\n");
            $PhraseHashLast{$hash} = $docid;
        }
        $word_b = $word;
    }
}
# Construct NMZ.p and NMZ.pi: merge %PhraseHash with the existing
# temporary files (the heavy lifting is in write_phrase_hash_sub),
# then promote the merged output (__p/__pi) to the temporary names
# (_p/_pi).
sub write_phrase_hash () {
    write_phrase_hash_sub();
    util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
    util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
}
# Merge the in-memory %PhraseHash posting lists with any existing
# NMZ._p/_pi data, writing the result to NMZ.__p/__pi.  For each of the
# 65536 phrase-hash values the index file holds either the byte offset
# of the record in the data file, or 0xffffffff when that hash has no
# postings.
sub write_phrase_hash_sub () {
    my $opened = 0;
    # BUGFIX: was `%PhraseHash eq "0"`, which relied on the string form
    # of a hash in scalar context; a plain boolean test is exact.
    return 0 unless %PhraseHash;
    util::dprint(_("doing write_phrase_hash() processing.\n"));
    my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
    my $fh_tmp_p = util::efopen(">$var::NMZ{'__p'}");
    my $fh_phrase = util::fopen($var::NMZ{'_p'});
    my $fh_phraseindex;
    if ($fh_phrase) {
        $fh_phraseindex = util::efopen($var::NMZ{'_pi'});
        $opened = 1;
    }
    my $ptr = 0;  # running byte offset within __p
    for (my $i = 0; $i < 65536; $i++) {
        # Read the existing record for hash value $i, if any.
        my $baserecord = "";
        my $baseleng = 0;
        if ($opened) {
            my $tmp = 0;
            read($fh_phraseindex, $tmp, $var::INTSIZE);
            $tmp = unpack("N", $tmp);
            if ($tmp != 0xffffffff) { # 0xffffffff means "no record"
                $baseleng = readw($fh_phrase);
                read($fh_phrase, $baserecord, $baseleng);
            }
        }
        if (defined($PhraseHash{$i})) {
            if ($baserecord eq "") {
                # New record only.
                print $fh_tmp_pi pack("N", $ptr);
                my $record = $PhraseHash{$i};
                my $n2 = length($record);
                my $data = pack("w", $n2) . $record;
                print $fh_tmp_p $data;
                $ptr += length($data);
            } else {
                # Append the new postings after the existing ones; the
                # first new docid delta must be rebased on the last
                # docid already present in the base record.
                print $fh_tmp_pi pack("N", $ptr);
                my $record = $PhraseHash{$i};
                my $last_docid = get_last_docid($baserecord, 1);
                my $adjrecord = adjust_first_docid($record, $last_docid);
                # BUGFIX: the negative-delta check tested $record (which
                # is always defined) instead of $adjrecord, so the
                # namazu-bugs-ja#31 diagnostic could never fire.
                check_records(\$record, \$baserecord, 1) unless defined $adjrecord;
                $record = $adjrecord;
                my $n2 = length($record) + $baseleng;
                my $data = pack("w", $n2) . $baserecord . $record;
                print $fh_tmp_p $data;
                $ptr += length($data);
            }
        } else {
            if ($baserecord eq "") {
                # if $baserecord has no data, set to 0xffffffff
                print $fh_tmp_pi pack("N", 0xffffffff);
            } else {
                # Carry the existing record over unchanged.
                print $fh_tmp_pi pack("N", $ptr);
                my $data = pack("w", $baseleng) . $baserecord;
                print $fh_tmp_p $data;
                $ptr += length($data);
            }
        }
    }
    %PhraseHash = ();
    %PhraseHashLast = ();
}
# Dr. Knuth's ``hash'' from (UNIX MAGAZINE May 1998).
# Fold a word into a 16-bit value by XOR-ing per-character entries of
# the global @Seed table; the table row cycles through 0..3.
sub hash ($) {
    my ($word) = @_;
    my $value = 0;
    my $len = length $word;
    for (my $pos = 0; $pos < $len; $pos++) {
        $value ^= $Seed[$pos & 0x03][ord(substr($word, $pos, 1))];
    }
    return $value & 65535;
}
# Count frequencies of words.
# Tokenize the document ($$contref, modified in place), apply the
# per-section score weights embedded as \x7f markers, and fold the
# counts into the global %KeyIndex posting lists (docid delta + count,
# both BER-packed) for document $docid_count + $docid_base.
sub count_words ($$$$) {
    my ($docid_count, $docid_base, $contref, $kanji) = @_;
    my (@tmp);
    # Normalize into small letter.
    $$contref =~ tr/A-Z/a-z/;
    # Do wakatigaki (Japanese word splitting) if necessary.
    if (util::islang("ja")) {
        wakati::wakatize_japanese($contref) if $kanji;
    }
    # Remove all symbols when -K option is specified.
    $$contref =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt{'nosymbol'};
    # The first \x7f marker separates plain text (weight 1) from the
    # weighted sections that follow it.
    my $part1 = "";
    my $part2 = "";
    if ($$contref =~ /\x7f/) {
        $part1 = substr $$contref, 0, index($$contref, "\x7f");
        $part2 = substr $$contref, index($$contref, "\x7f");
        # $part1 = $PREMATCH; # $& and friends are not efficient
        # $part2 = $MATCH . $POSTMATCH;
    } else {
        $part1 = $$contref;
        $part2 = "";
    }
    # Do scoring: each weighted section has the shape
    # \x7f<weight>\x7f<text>\x7f/<weight>\x7f.
    my %word_count = ();
    $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
                wordcount_sub($2, $1, \%word_count)!ge;
    wordcount_sub($part1, 1, \%word_count);
    # Add them to the whole index.
    my $docid = $docid_count + $docid_base;
    for my $word (keys(%word_count)) {
        next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
        $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
        $KeyIndex{$word} .= pack("w2",
                                 $docid - $KeyIndexLast{$word},
                                 $word_count{$word});
        $KeyIndexLast{$word} = $docid;
    }
}
#
# Count words and do score weighting.
#
# Add $weight to %$word_count for every whitespace-separated word in
# $text.  Symbol handling (skipped under -K/--nosymbol, since all
# symbols were removed upstream):
#
#     tcp/ip      -> tcp/ip, tcp, ip
#     (tcp/ip)    -> (tcp/ip), tcp/ip, tcp, ip
#     ((tcpi/ip)) -> ((tcp/ip)), (tcp/ip), tcp
#
# Don't do processing for nested symbols.
# Returns "" because it is used as the replacement in a s///e.
sub wordcount_sub ($$\%) {
    my ($text, $weight, $word_count) = @_;
    my @words = split /\s+/, $text;
    for my $word (@words) {
        next if $word eq "";
        if ($var::Opt{'noedgesymbol'}) {
            # remove symbols at both ends
            $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
        }
        $word_count->{$word} = 0 unless defined($word_count->{$word});
        $word_count->{$word} += $weight;
        unless ($var::Opt{'nosymbol'}) {
            # Strip one layer of surrounding/leading/trailing symbols
            # and credit the inner token as well; stop early when the
            # inner token carries no further symbols.
            if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
                $word_count->{$1} = 0 unless defined($word_count->{$1});
                $word_count->{$1} += $weight;
                next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
            } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
                $word_count->{$1} = 0 unless defined($word_count->{$1});
                $word_count->{$1} += $weight;
                next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
            } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
                $word_count->{$1} = 0 unless defined($word_count->{$1});
                $word_count->{$1} += $weight;
                next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
            }
            # BUGFIX: was `my @words_ = split(...) if COND;` — a
            # conditional `my`, whose behavior is undefined in Perl.
            # Declare unconditionally; the explicit trailing reset the
            # original needed is now unnecessary since the array is
            # freshly scoped each iteration.
            my @words_ = ();
            @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
                if $word =~ /[^\xa1-\xfea-z_0-9]/;
            # Credit every symbol-free fragment of the word.
            for my $tmp (@words_) {
                next if $tmp eq "";
                $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
                $word_count->{$tmp} += $weight;
            }
        }
    }
    return "";
}
# Construct NMZ.i and NMZ.ii: merge %KeyIndex with the existing
# temporary word/posting files (the heavy lifting is in
# write_index_sub), promote the merged output (__i/__w) to the
# temporary names (_i/_w), and return the number of keys written.
sub write_index () {
    my $key_count = write_index_sub();
    util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
    util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
    return $key_count;
}
# readw: read one pack 'w' (BER-compressed integer) value from $fh.
# The high bit (0x80) marks every byte except the last one.
# This code was contributed by <furukawa@tcp-ip.or.jp>.
sub readw ($) {
    my ($fh) = @_;
    my $bytes = '';
    my $byte;
    while (read($fh, $byte, 1)) {
        $bytes .= $byte;
        last unless ord($byte) & 0x80;
    }
    return unpack('w', $bytes);
}
# Sum every $step-th BER integer of $record.  Posting records store
# docids as deltas (one every $step values), so the sum of the deltas
# is the last absolute docid in the record.
sub get_last_docid ($$) {
    my ($record, $step) = @_;
    my (@data) = unpack 'w*', $record;
    my $sum = 0;
    for (my $i = 0; $i < @data; $i += $step) {
        $sum += $data[$i];
    }
    # BUGFIX: removed an unused local ($leng = @data / $step).
    return $sum;
}
# Rebase the first docid delta of $record so the record can be appended
# after one whose last absolute docid is $last_docid.  Returns the
# adjusted record, or undef when the delta would become negative
# (namazu-bug-ja#31).
sub adjust_first_docid ($$) {
    my ($record, $last_docid) = @_;
    my @ids = unpack 'w*', $record;
    my $delta = $ids[0] - $last_docid;
    return undef if ($delta < 0); # namazu-bug-ja#31
    $ids[0] = $delta;
    return pack 'w*', @ids;
}
# Merge the sorted in-memory %KeyIndex with the existing sorted word
# list (NMZ._w) and posting file (NMZ._i), producing NMZ.__w/__i plus
# their fixed-width offset indexes NMZ._wi/_ii.  Classic sorted merge:
# walk the existing words in order and interleave the new words at
# their sorted positions, concatenating posting records when a word
# exists on both sides.  Returns the number of keys written.
sub write_index_sub () {
    my @words = sort keys(%KeyIndex);
    return 0 if $#words == -1;   # nothing new to merge
    my $cnt = 0;      # cursor into @words
    my $ptr_i = 0;    # byte offset within __i
    my $ptr_w = 0;    # byte offset within __w
    my $key_count = 0;
    my $baserecord = "";
    util::dprint(_("doing write_index() processing.\n"));
    my $fh_tmp_i = util::efopen(">$var::NMZ{'__i'}");
    my $fh_tmp_w = util::efopen(">$var::NMZ{'__w'}");
    my $fh_i = util::fopen($var::NMZ{'_i'});
    my $fh_ii = util::efopen(">$var::NMZ{'_ii'}");
    my $fh_w = util::fopen($var::NMZ{'_w'});
    my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
    if ($fh_w) {
      FOO:
        while (defined(my $line = <$fh_w>)) {
            chop $line;
            my $current_word = $line;
            my $baseleng = readw($fh_i);
            read($fh_i, $baserecord, $baseleng);
            # Flush every new word sorting at or before $current_word.
            for (; $cnt < @words; $cnt++) {
                last unless $words[$cnt] le $current_word;
                my $record = $KeyIndex{$words[$cnt]};
                my $leng = length($record);
                if ($current_word eq $words[$cnt]) {
                    # Word on both sides: append the new postings after
                    # the old ones, rebasing the first docid delta.
                    my $last_docid = get_last_docid($baserecord, 2);
                    my $adjrecord = adjust_first_docid($record, $last_docid);
                    # BUGFIX: the negative-delta check tested $record
                    # (always defined) instead of $adjrecord, so the
                    # namazu-bugs-ja#31 diagnostic could never fire.
                    check_records(\$record, \$baserecord, 2) unless defined $adjrecord;
                    $record = $adjrecord;
                    $leng = length($record); # re-measure
                    my $tmp = pack("w", $leng + $baseleng);
                    my $data_i = "$tmp$baserecord$record";
                    my $data_w = "$current_word\n";
                    print $fh_tmp_i $data_i;
                    print $fh_tmp_w $data_w;
                    print $fh_ii pack("N", $ptr_i);
                    print $fh_wi pack("N", $ptr_w);
                    $ptr_i += length($data_i);
                    $ptr_w += length($data_w);
                    $key_count++;
                    $cnt++;
                    next FOO;
                } else {
                    # Brand-new word sorting before $current_word.
                    my $tmp = pack("w", $leng);
                    my $data_i = "$tmp$record";
                    my $data_w = "$words[$cnt]\n";
                    print $fh_tmp_i $data_i;
                    print $fh_tmp_w $data_w;
                    print $fh_ii pack("N", $ptr_i);
                    print $fh_wi pack("N", $ptr_w);
                    $ptr_i += length($data_i);
                    $ptr_w += length($data_w);
                    $key_count++;
                }
            }
            # No new word matched: carry the existing entry over as-is.
            my $tmp = pack("w", $baseleng);
            my $data_i = "$tmp$baserecord";
            my $data_w = "$current_word\n";
            print $fh_tmp_i $data_i;
            print $fh_tmp_w $data_w;
            print $fh_ii pack("N", $ptr_i);
            print $fh_wi pack("N", $ptr_w);
            $ptr_i += length($data_i);
            $ptr_w += length($data_w);
            $key_count++;
        }
    }
    # Remaining new words sorting after the last existing word.
    while ($cnt < @words) {
        my $leng = length($KeyIndex{$words[$cnt]});
        my $tmp = pack("w", $leng);
        my $record = $KeyIndex{$words[$cnt]};
        my $data_i = "$tmp$record";
        my $data_w = "$words[$cnt]\n";
        print $fh_tmp_i $data_i;
        print $fh_tmp_w $data_w;
        print $fh_ii pack("N", $ptr_i);
        print $fh_wi pack("N", $ptr_w);
        $ptr_i += length($data_i);
        $ptr_w += length($data_w);
        $key_count++;
        $cnt++;
    }
    %KeyIndex = ();
    %KeyIndexLast = ();
    return $key_count;
}
#
# Decide the media type.
# FIXME: Very ad hoc. It's just a compromise. -- satoru
#
sub decide_type ($$) {
    my ($name, $cont) = @_;
    # No content-based guess, or both agree: trust the name-based type.
    return $name if (!defined $cont || $name eq $cont);
    util::dprint("decide_type: name: $name, cont: $cont\n");
    # Cases where the name-based type is considered more reliable than
    # the content-based one.
    # FIXME: Currently File::MMagic 1.02's checktype_data()
    # is unreliable for excel/powerpoint/msword.
    my $name_wins =
        ($cont =~ m!^text/plain! && $name =~ m!^text/plain!)
        || $cont =~ m!^application/octet-stream!
        || ($cont =~ m!^application/(excel|powerpoint|msword)!
            && $name !~ m!^application/octet-stream!);
    return $name_wins ? $name : $cont;
}
#
# Debugging code for the "negative numbers" problem.
# Dump both posting records to NMZ.bug.info and abort the run; invoked
# when adjust_first_docid() reports a negative docid delta.
#
sub check_records ($$$) {
    my ($recref, $baserecref, $step) = @_;
    dump_record($baserecref, $step);
    dump_record($recref, $step);
    print STDERR "The \x22negative number\x22 problem occurred.\n";
    exit(1);
}
# Append the BER integers of one posting record to NMZ.bug.info in hex,
# for the "negative number" diagnostic above.
sub dump_record($$) {
    my ($recref, $step) = @_;
    my (@data) = unpack 'w*', $$recref;
    print STDERR "dump record data to NMZ.bug.info (step: $step)...";
    my $fh_info = util::fopen(">> NMZ.bug.info");
    print $fh_info "dumped record data (step: $step)...";
    foreach (@data) {
        print $fh_info sprintf(" %08x", $_);
    }
    print $fh_info "\n";
    return;
}
#
# For avoiding "used only once: possible typo at ..." warnings.
# muda() is a no-op; mentioning each package variable here gives it a
# second use so perl -w stays quiet about variables only read in other
# files.
#
muda($conf::ON_MEMORY_MAX,
     $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
     $conf::DENY_FILE, $var::INTSIZE,
     $conf::CHASEN_NOUN, $conf::CHASEN,
     $conf::KAKASI, $var::Opt{'okurigana'},
     $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
     $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
     $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
     $conf::ADDRESS, $var::MAILING_ADDRESS,
     $conf::FILE_SIZE_MAX,
     );
sub muda {}